In [1]:
import graphlab
In [2]:
# Load the saved SFrame of listening records from the local directory.
song_data = graphlab.SFrame(data='song_data.gl/')
In [3]:
# Preview the first rows of the data set (10 is the default row count).
song_data.head(n=10)
Out[3]:
In [4]:
# Render GraphLab Canvas visualisations inline in the notebook.
graphlab.canvas.set_target(target='ipynb')
In [5]:
# Visualise the 'song' column in Canvas.
song_data.select_column('song').show()
In [6]:
# Total number of rows in the data set.
song_data.num_rows()
Out[6]:
In [7]:
# Distinct user ids that appear anywhere in the data set.
users = (
    song_data['user_id']
    .unique()
)
In [8]:
# Number of distinct users.
users.size()
Out[8]:
In [20]:
# Rows of song_data for each of the four artists of interest.
# A generator expression is used (rather than a list comprehension) so the
# loop variable does not leak into the notebook namespace under Python 2.
kanye_songs, foo_songs, swift_songs, gaga_songs = (
    song_data[song_data['artist'] == artist_name]
    for artist_name in ('Kanye West', 'Foo Fighters', 'Taylor Swift', 'Lady GaGa')
)
In [21]:
# Distinct user ids found in each artist's rows.
kanye_users, foo_users, swift_users, gaga_users = (
    artist_songs['user_id'].unique()
    for artist_songs in (kanye_songs, foo_songs, swift_songs, gaga_songs)
)
In [25]:
# Unique-listener counts, in the order:
# Kanye West, Foo Fighters, Taylor Swift, Lady GaGa.
# The parenthesised single-argument form prints the same text under the
# Python 2 print statement and the Python 3 print function.
print("{} {} {} {}".format(len(kanye_users), len(foo_users), len(swift_users), len(gaga_users)))
In [29]:
# Sum 'listen_count' per artist; the result has one row per artist with the
# total stored in a 'total_count' column.
listener_count = song_data.groupby(
    'artist',
    {'total_count': graphlab.aggregate.SUM('listen_count')},
)
In [33]:
# Order artists from highest to lowest total, then show the top rows.
listener_count = listener_count.sort(sort_columns='total_count', ascending=False)
listener_count.head(n=10)
Out[33]:
In [34]:
# Order artists from lowest to highest total, then show the first rows.
listener_count = listener_count.sort(sort_columns='total_count', ascending=True)
listener_count.head(n=10)
Out[34]:
In [ ]:
In [9]:
# 80/20 random split of the rows; the fixed seed makes the split repeatable.
train_data, test_data = song_data.random_split(fraction=0.8, seed=0)
In [10]:
# Popularity-based recommender trained on the training split, with users
# identified by 'user_id' and items by the 'song' column.
popularity_model = graphlab.popularity_recommender.create(
    observation_data=train_data,
    user_id='user_id',
    item_id='song',
)
In [ ]:
# Popularity-model recommendations for the first user in `users`
# (k=10 is the library default, spelled out here).
popularity_model.recommend(users=[users[0]], k=10)
In [ ]:
# Popularity-model recommendations for the second user in `users`
# (k=10 is the library default, spelled out here).
popularity_model.recommend(users=[users[1]], k=10)
In [ ]:
# Item-similarity recommender trained on the same training split, with users
# identified by 'user_id' and items by the 'song' column.
personalized_model = graphlab.item_similarity_recommender.create(
    observation_data=train_data,
    user_id='user_id',
    item_id='song',
)
In [ ]:
# Item-similarity recommendations for the first user in `users`
# (k=10 is the library default, spelled out here).
personalized_model.recommend(users=[users[0]], k=10)
In [ ]:
# Item-similarity recommendations for the second user in `users`
# (k=10 is the library default, spelled out here).
personalized_model.recommend(users=[users[1]], k=10)
In [ ]:
# Songs the item-similarity model considers closest to this track.
personalized_model.get_similar_items(items=['With Or Without You - U2'])
In [ ]:
# Songs the item-similarity model considers closest to this track.
personalized_model.get_similar_items(items=['Chan Chan (Live) - Buena Vista Social Club'])
In [ ]:
if graphlab.version[:3] >= "1.6":
model_performance = graphlab.compare(test_data, [popularity_model, personalized_model], user_sample=0.05)
graphlab.show_comparison(model_performance,[popularity_model, personalized_model])
else:
%matplotlib inline
model_performance = graphlab.recommender.util.compare_models(test_data, [popularity_model, personalized_model], user_sample=.05)
The precision-recall curve above shows that the personalized model performs much better than the popularity model.
In [35]:
# Same 80/20 split call as earlier in the notebook (same fraction and seed),
# repeated so the cells below can be run from this point.
train_data, test_data = song_data.random_split(fraction=0.8, seed=0)
In [39]:
# Item-similarity recommender trained on the training split, with users
# identified by 'user_id' and items by the 'song' column.
item_similarity_model = graphlab.item_similarity_recommender.create(
    observation_data=train_data,
    user_id='user_id',
    item_id='song',
)
In [40]:
# At most the first 10,000 distinct user ids found in the test split.
subset_test_users = test_data['user_id'].unique()[:10000]
In [43]:
# Single top recommendation (k=1) for each user in the test-user subset.
rec_songs = item_similarity_model.recommend(users=subset_test_users, k=1)
In [45]:
# Preview the first recommendation rows (10 is the default row count).
rec_songs.head(n=10)
Out[45]:
In [47]:
# Count how many times each song appears among the recommendations.
song_count = rec_songs.groupby(
    'song',
    {'count': graphlab.aggregate.COUNT()},
)
In [49]:
# Most frequently recommended songs first.
song_count.sort(sort_columns='count', ascending=False)
Out[49]:
In [ ]: